import pandas as pd
import scanpy as sc
import numpy as np
from matplotlib import pyplot as plt
from collections import OrderedDict
import os
import sys
import gc
sys.path.append("lib")
sys.path.append("../lib")
from jupytertools import setwd, fix_logging, display
from toolz.functoolz import pipe, partial
from multiprocessing import Pool
import seaborn as sns
from plotnine import ggplot, aes
import plotnine as n
import scipy.stats as stats
import itertools
setwd()
fix_logging(sc.settings)Working directory did not change because calling from nextflow.
%%R
library(conflicted)
conflict_prefer("Position", "base")
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggbeeswarm)
library(edgeR)
options(max.print=100)
options(repr.matrix.max.cols=50, repr.matrix.max.rows=6)R[write to console]: [conflicted] Will prefer base::Position over any other package
R[write to console]: Loading required package: magrittr
R[write to console]: Loading required package: limma
Index(['batch', 'samples', 'patient', 'origin', 'replicate', 'dataset',
'tumor_type', 'platform', 'age', 'sex', 'hpv_status', 'ir_status',
'facs_purity_cd3', 'facs_purity_cd56', 'n_genes', 'mt_frac', 'n_counts',
'rk_n_counts', 'rk_n_genes', 'rk_mt_frac', 'doublet_score',
'is_doublet', 'leiden', 'S_score', 'G2M_score', 'phase', 'cell_type',
'cell_type_unknown', 'cell_type_coarse'],
dtype='object')
Note that scikit-learn's randomized PCA might not be exactly reproducible across different computational platforms. For exact reproducibility, choose `svd_solver='arpack'.` This will likely become the Scanpy default in the future.
computing PCA with n_comps = 50
finished
computing neighbors
using 'X_pca' with n_pcs = 50
finished
computing UMAP
finished
sc.pl.umap(adata, color=["samples", "cell_type", "hpv_status", "ir_status"])
sc.pl.umap(adata, color=["CD4", "CD8A", "FOXP3", "PDCD1", "KLRF1"])(using combat with covariates did not work out -> singular matrix, i.e. too few samples per group)
sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=3000)
sc.pl.highly_variable_genes(adata)extracting highly variable genes
finished
added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
computing PCA with n_comps = 50
computing PCA on highly variable genes
finished
computing neighbors
using 'X_pca' with n_pcs = 50
finished
computing UMAP
finished
sc.pl.umap(adata, color=["samples", "cell_type", "hpv_status", "ir_status"])
sc.pl.umap(adata, color=["CD4", "CD8A", "FOXP3", "PDCD1", "KLRF1"])for ct, tmp_adata in adatas.items():
print("###########################\n{}\n###########################\n\n".format(ct))
sc.pp.highly_variable_genes(tmp_adata, flavor="cell_ranger", n_top_genes=2000)
sc.pl.highly_variable_genes(tmp_adata)###########################
CD4
###########################
extracting highly variable genes
finished
added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
###########################
CD8
###########################
extracting highly variable genes
finished
added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
###########################
NK
###########################
extracting highly variable genes
/home/sturm/scratch/projects/2020/kortekaas2020_paper/work/conda/run_notebook-b99176dfb05e0522988294512373169f/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py:135: RuntimeWarning: divide by zero encountered in true_divide
) / disp_mad_bin[df['mean_bin'].values].values
/home/sturm/scratch/projects/2020/kortekaas2020_paper/work/conda/run_notebook-b99176dfb05e0522988294512373169f/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py:135: RuntimeWarning: invalid value encountered in true_divide
) / disp_mad_bin[df['mean_bin'].values].values
finished
added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
for ct, tmp_adata in adatas.items():
print("###########################\n{}\n###########################\n\n".format(ct))
sc.pp.pca(tmp_adata, svd_solver="arpack")
sc.pp.neighbors(tmp_adata, n_neighbors=10)
sc.tl.umap(tmp_adata)
sc.pl.umap(tmp_adata, color=["samples", "cell_type", "hpv_status", "ir_status"])###########################
CD4
###########################
computing PCA with n_comps = 50
computing PCA on highly variable genes
finished
computing neighbors
using 'X_pca' with n_pcs = 50
finished
computing UMAP
finished
###########################
CD8
###########################
computing PCA with n_comps = 50
computing PCA on highly variable genes
finished
computing neighbors
using 'X_pca' with n_pcs = 50
finished
computing UMAP
finished
###########################
NK
###########################
computing PCA with n_comps = 50
computing PCA on highly variable genes
finished
computing neighbors
using 'X_pca' with n_pcs = 50
finished
computing UMAP
finished
def test_leiden_thresholds(adata_key, resolutions, seeds, n_cpus):
"""
Test different leiden thresholds.
Args:
adata:
resolutions: numpy array containing all resolutions to test
seeds: numpy array containin random seeds (every resolution is tested with every seed)
p: multiprocessing.Pool
"""
args = list(itertools.product([adata_key], resolutions, seeds))
# p = lambda: None
# p.starmap = lambda x, a: [x for x in itertools.starmap(x, a)]
leiden_results = p.starmap(leiden_with_r, args)
n_clusters = [leiden_results[i].cat.categories.size for i, _ in enumerate(args)]
# re-arrange results in dataframe and aggregate by mean.
clusters = {s: dict() for s in seeds}
for i, (a, r, s) in enumerate(args):
clusters[s][r] = n_clusters[i]
clusters_mean = np.mean(pd.DataFrame.from_dict(clusters).values, axis=1)
return clusters_meanresolutions = np.arange(0.1, 1.5, 0.05)
seeds = np.arange(0, 10)
for ct, tmp_adata in adatas.items():
print("###########################\n{}\n###########################\n\n".format(ct))
clusters_mean = test_leiden_thresholds(ct, resolutions, seeds, 16)
plt.plot(resolutions, clusters_mean)
# plt.plot(resolutions, pd.Series(clusters_mean).rolling(3), color="red")
plt.xlabel("leiden resolution")
plt.ylabel("#clusters")
plt.vlines(x=leiden_thres[ct], ymin=0, ymax=plt.ylim()[1], color="grey")
plt.show()###########################
CD4
###########################
###########################
CD8
###########################
###########################
NK
###########################
There is no clear plateau... anyway, 10 clusters sounds reasonable.
for ct, tmp_adata in adatas.items():
print("###########################\n{}\n###########################\n\n".format(ct))
sc.tl.leiden(tmp_adata, resolution=leiden_thres[ct])
sc.pl.umap(tmp_adata, color="leiden", legend_loc="on data")
sc.tl.rank_genes_groups(tmp_adata, groupby="leiden")
sc.pl.rank_genes_groups(tmp_adata)###########################
CD4
###########################
running Leiden clustering
finished
ranking genes
finished
###########################
CD8
###########################
running Leiden clustering
finished
ranking genes
finished
###########################
NK
###########################
running Leiden clustering
finished
ranking genes
finished
... storing 'cluster' as categorical
adata_edger = adata.copy()
# edgeR expects raw counts, normalization does not have any effect. We can therefore simply undo log1p
adata_edger.X = np.expm1(adata_edger.X)
hpv_map = {"HPV16+": "hpv_pos", "HPV-": "hpv_neg"}
ct_map = {
"T cell CD8+": "t_cd8",
"T cell regulatory": "t_reg",
"T cell CD4+ non-regulatory": "t_cd4",
"NK cell": "nk",
}
ir_map = {"IR+": "ir_pos", "IR-": "ir_neg"}
def remap(adata, col, dict_):
adata.obs[col] = [dict_.get(x, x) for x in adata.obs[col]]
remap(adata_edger, "hpv_status", hpv_map)
remap(adata_edger, "ir_status", ir_map)
remap(adata_edger, "cell_type", ct_map)[CD4_2, CD4_0, CD8_2, CD8_0, CD8_3, NK_0, CD4_1, CD8_1, NK_2, NK_1]
Categories (10, object): [CD4_2, CD4_0, CD8_2, CD8_0, ..., CD4_1, CD8_1, NK_2, NK_1]
As we compare between patients and not between cells, it seems advantageous to create "artificial bulk" samples. If we test on the single cells between patients, expression changes that are driven by a single patient (and might be batch effects) become significant.
The data in adata_edger is normalized and non-log-transformed (the log-transformation was undone above)
bobs = (
adata_edger.obs.groupby(["patient", "hpv_status", "ir_status"])
.agg(average_mito=("mt_frac", np.mean), total_reads=("n_counts", np.sum))
.reset_index()
.set_index("patient")
.dropna()
)
bulk_Xs = {}
tmp_cell_types = ["overall", "t_cd8", "t_cd4", "t_reg", "nk"]
for cell_type in tmp_cell_types:
ct_mask = (
(adata_edger.obs["cell_type"] == cell_type).values
if cell_type != "overall"
else True
)
bulk_Xs[cell_type] = OrderedDict()
for patient in bobs.index:
patient_mask = (adata_edger.obs["patient"] == patient).values
bulk_Xs[cell_type][patient] = np.mean(
adata_edger.X[ct_mask & patient_mask, :], axis=0
)
assert bulk_Xs[cell_type][patient].size == adata.shape[1]
# bdata = sc.AnnData(var = adata.var, )bdatas = []
for cell_type in tmp_cell_types:
tmp_colnames = np.hstack([n for n in bulk_Xs[cell_type].keys()])
assert np.all(tmp_colnames == bobs.index), "order of samples matches obs"
X = np.vstack([s for s in bulk_Xs[cell_type].values()])
bdatas.append(sc.AnnData(var=adata_edger.var, obs=bobs, X=X))%%R
#' make contrasts: one against all others.
#'
#' @param design design matrix
#' @param col_data colData or the SingleCellExperiment object.
#' @param column column name that is used for the contrasts. Also needs to be
#' specified as first variable in the model.matrix.
make_contrasts = function(design, col_data, column) {
n_clus = length(unique(col_data[[column]]))
upper_block = matrix(rep(-1/(n_clus-1), n_clus^2), ncol=n_clus)
diag(upper_block) = rep(1, n_clus)
lower_block = matrix(rep(0, (ncol(design)-n_clus) * n_clus), ncol=n_clus)
contrasts = rbind(upper_block, lower_block)
rownames(contrasts) = colnames(design)
colnames(contrasts) = colnames(design)[1:n_clus]
contrasts
}%%R
# per cell type
adata_bk = adata
cell_types = list("t_cd8", c("t_cd4", "t_reg"), "nk")
lapply(cell_types, function(ct) {
tmp_adata = adata_bk[, colData(adata_bk)$cell_type %in% ct]
colData(tmp_adata)$cluster = droplevels(colData(tmp_adata)$cluster)
print(dim(colData(tmp_adata)))
design = model.matrix(~0 + cluster + patient + n_genes + mt_frac, data=colData(tmp_adata))
print(dim(design))
contrasts = make_contrasts(design, colData(tmp_adata), "cluster")
adata=tmp_adata
save(adata, design, contrasts, file=paste0(de_dir, '/cluster_', ct[1], '.rda'), compress=FALSE)
})
adata = adata_bk[1]
8663
30
[1]
8663
18
[1]
11582
30
[1]
11582
17
[1]
2780
30
[1]
2780
17
%%R
# per cell type
adata_bk = adata
cell_types = c("t_cd8", "t_cd4", "t_reg", "nk")
lapply(cell_types, function(ct) {
tmp_adata = adata_bk[, colData(adata_bk)$cell_type == ct]
design = model.matrix(~0 + hpv_status + n_genes + mt_frac, data=colData(tmp_adata))
contrasts = makeContrasts(
overall = hpv_statushpv_pos - hpv_statushpv_neg,
levels = colnames(design)
)
adata=tmp_adata
save(adata, design, contrasts, file=paste0(de_dir, '/hpv_', ct, '.rda'), compress=FALSE)
})
adata = adata_bk%%R
adata_bk = adata
lapply(names(bdatas), function(ct) {
bdata = bdatas[[ct]]
design = model.matrix(~0 + hpv_status + average_mito + total_reads, data=colData(bdata))
contrasts = makeContrasts(
overall = hpv_statushpv_pos - hpv_statushpv_neg,
levels = colnames(design)
)
adata=bdata
save(adata, design, contrasts, file=paste0(de_dir, '/bulk_hpv_', ct, '.rda'), compress=FALSE)
})
adata = adata_bk%%R
adata_bk = adata
tmp_adata = adata[, colData(adata)$hpv_status == "hpv_pos"]
design = model.matrix(~0 + ir_status + cell_type + n_genes + mt_frac, data=colData(tmp_adata))
contrasts = makeContrasts(
overall = ir_statusir_pos - ir_statusir_neg,
levels = colnames(design)
)
adata = tmp_adata
save(adata, design, contrasts, file=paste0(de_dir, '/ir.rda'), compress=FALSE)
adata = adata_bk%%R
# per cell type
adata_bk = adata
cell_types = c("t_cd8", "t_cd4", "t_reg", "nk")
lapply(cell_types, function(ct) {
tmp_adata = adata[, colData(adata_bk)$cell_type == ct & colData(adata_bk)$hpv_status == "hpv_pos"]
design = model.matrix(~0 + ir_status + n_genes + mt_frac, data=colData(tmp_adata))
contrasts = makeContrasts(
overall = ir_statusir_pos - ir_statusir_neg,
levels = colnames(design)
)
adata=tmp_adata
save(adata, design, contrasts, file=paste0(de_dir, '/ir_', ct, '.rda'), compress=FALSE)
})
adata = adata_bk%%R
adata_bk = adata
lapply(names(bdatas), function(ct) {
bdata = bdatas[[ct]]
design = model.matrix(~0 + ir_status + average_mito + total_reads, data=colData(bdata))
contrasts = makeContrasts(
overall = ir_statusir_pos - ir_statusir_neg,
levels = colnames(design)
)
adata=bdata
save(adata, design, contrasts, file=paste0(de_dir, '/bulk_ir_', ct, '.rda'), compress=FALSE)
})
adata = adata_bk